#include "permUtility.h"

// CUDA runtime library
#include <cuda_runtime.h>

// General uility functions for permutation

// compute the number of 1s in 64 bit string (alternative implementation)
int bitCount(uint64 i)
{
	i = i - ((i >> 1) & 0x5555555555555555);
	i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333);
	i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0f;
	i = i + (i >> 8);
	i = i + (i >> 16);
	i = i + (i >> 32);
	return (int)i & 0x7f;
}

// compute the absolute value of double
double Abs(double a)
{
	return((a<0) ? -a : a);
}

// convert string to upper case
void toUpperCaseString(char* inputString, int strLen) {
	for ( int i = 0; i < strLen; i++ ) {
		inputString[i] = toupper(inputString[i]);
	}
}

// get the data size of an input list file of BOOST program
// return the number of input data files.
// DataSize is an array storing the n and p in each input file.
int GetDataSize(char *filename, int **DataSize)
{
	FILE * fp, *fp_i;
	int c, ndataset;
	time_t st,ed;
	int n, p, i, flag,ii;
	char filename_i[100];

	fp = fopen(filename,"r");
	if(fp == NULL)
	{
		printf("can't open input file %s\n",filename);
		exit(1);
	}

	ndataset = 0;
	while(!feof(fp)) {
		ndataset++;
		fscanf(fp, "%s\n", &filename_i);
	}

	*DataSize = (int *)calloc( ndataset*2, sizeof(int));

	ii = 0;
	rewind(fp);
	while(!feof(fp)) {
		ii++;
		fscanf(fp, "%s\n", &filename_i);

		fp_i = fopen(filename_i, "r");
		if(fp_i == NULL)
		{
			printf("can't open input file %s\n",filename_i);
			exit(1);
		}
		printf("start getting data size of file %d: %s\n", ii, filename_i);
		time(&st);
		//initialization
		if (ii == 1)
		{
			n = 0;//samples number

			// find the number of samples: n
			while(1)
			{
				int c = fgetc(fp_i);//read a character from the data file
				switch(c)
				{
				case '\n'://the end of line
					n++;
					break;
					// fall through,
					// count the '-1' element
				case EOF://file end
					goto out;
				default:
					;
				}
			}

		}
out:
		rewind(fp_i);//Repositions the file pointer to the beginning of a file

		// find number of variables: p
		p= 0;
		i= 0;
		flag = 1;
		while(1)
		{
			c = getc(fp_i);
			if(c=='\n') goto out2;//end of line
			if(isspace(c))
			{
				flag = 1;
			}
			/*do {
			c = getc(fp);
			if(c=='\n') goto out2;//end of line
			} while(isspace(c));//space
			*/
			if (!isspace(c) && (flag==1))
			{
				p++;//indicate the dimension of the vector
				flag = 0;
			}

		}
out2:
		fclose(fp_i);

		time(&ed);

		//	DataSize[0] = n;
		(*DataSize)[ndataset * 0 + ii - 1] = n;
		(*DataSize)[ndataset * 1 + ii - 1] += p-1;

	}

	fclose(fp);
	//printf("Data contains %d rows and %d column. \n", n, p);

	printf("cputime for getting data size: %d seconds.\n", (int) ed - st);
	return ndataset;
}




// Fast Log implmentation
// build log lookup table
void fill_icsi_log_table(const int n, float *lookup_table) {
	float numlog;
	int *const exp_ptr = ((int*)&numlog);
	int x = *exp_ptr;
	x = 0x3F800000;
	*exp_ptr = x;
	int incr = 1 << (23-n);
	int p=pow(2.0,n);
	for ( int i = 0; i < p; ++i ) {
		lookup_table[i] = log(numlog)/log(2.0);
		x += incr;
		*exp_ptr = x;
	}
}

// calculate the chi-square value of a model
float CalculateChiSquareOfModel(int* input) {
	float numerator = pow((float)(input[0]*input[3] - input[1]*input[2]),2)*(input[0]+input[1]+input[2]+input[3]);
	float denominator = ((float)(input[0]+input[1]))*(input[2]+input[3])*(input[1]+input[3])*(input[0]+input[2]);

	//printf("numerator : %f, denominator : %f", numerator, denominator);

	if ( abs(denominator) < 0.000001 ) {
		return 0.0f;
	}
	return numerator/denominator;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///// CUDA Related Functions //////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

int meetCUDARequirement() {
	#if CUDART_VERSION < 2030
		return -1;
	#else
		return 0;
	#endif
}

int initCUDA(int deviceId)
{
	int count, i;
	struct cudaDeviceProp prop;
	cudaError_t status;

	cudaGetDeviceCount(&count);
	if(count == 0) {
		fprintf(stderr, "There is no device.\n");
		return -1;
	}
	if(deviceId >=0) {
		printf("Try to set device %d.\n", deviceId);
		status = cudaSetDevice(deviceId);
		if (status == cudaSuccess) {
			printf("Set device %d success.\n", deviceId);
			return 0;
		} else {
			printf("Set device %d failed. Will try other devices.\n", deviceId);
		}
	}


	for(i = 0; i < count; i++) {
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				break;
			}
		}
	}

	if(i == count) {
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return -1;
	}
	printf("There are %d devices, device %d will be used.\n", count, i);
	status = cudaSetDevice(i);
	if (status != cudaSuccess) 
		printf ("Set Device error\n");
	return 0;
}

DeviceProperties::DeviceProperties() {
	// Find the number of devices
	cudaGetDeviceCount(&devCount);

	if ( devCount == 0 ) {
		return;
	}

	devPropArray = (cudaDeviceProp*)malloc(sizeof(struct cudaDeviceProp)*devCount);

	for (int i = 0; i < devCount; ++i)
	{
		cudaGetDeviceProperties(&devPropArray[i], i);
	}
}

DeviceProperties::~DeviceProperties() {
	free(devPropArray);
}

int DeviceProperties::getDeviceCount() {
	return devCount;
}

cudaDeviceProp DeviceProperties::getDeviceProp(int i) {
	return devPropArray[i];
}

void DeviceProperties::printDevProp(int i)
{
	cudaDeviceProp devProp = devPropArray[i];
	printf("Major revision number:         %d\n",  devProp.major);
	printf("Minor revision number:         %d\n",  devProp.minor);
	printf("Name:                          %s\n",  devProp.name);
	printf("Total global memory:           %u\n",  devProp.totalGlobalMem);
	printf("Total shared memory per block: %u\n",  devProp.sharedMemPerBlock);
	printf("Total registers per block:     %d\n",  devProp.regsPerBlock);
	printf("Warp size:                     %d\n",  devProp.warpSize);
	printf("Maximum memory pitch:          %u\n",  devProp.memPitch);
	printf("Maximum threads per block:     %d\n",  devProp.maxThreadsPerBlock);
	for (int i = 0; i < 3; ++i)
		printf("Maximum dimension %d of block:  %d\n", i, devProp.maxThreadsDim[i]);
	for (int i = 0; i < 3; ++i)
		printf("Maximum dimension %d of grid:   %d\n", i, devProp.maxGridSize[i]);
	printf("Clock rate:                    %d\n",  devProp.clockRate);
	printf("Total constant memory:         %u\n",  devProp.totalConstMem);
	printf("Texture alignment:             %u\n",  devProp.textureAlignment);
	printf("Concurrent copy and execution: %s\n",  (devProp.deviceOverlap ? "Yes" : "No"));
	printf("Number of multiprocessors:     %d\n",  devProp.multiProcessorCount);
	printf("Kernel execution timeout:      %s\n",  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
	return;
}

// C++ calling from function for Permutation
int permGPU(char *inputFilename, char *indexFile, char *outputFilePrefix, struct KernelParams kernelParams) {
	// Read data, only one pair of SNP
	int nPair = 0, nSample = 0, nSNP = 0;
	uint64 *genoData;
	int *position = NULL;
	double *statistics = NULL;
	double *p_value = NULL;
	int nCase, nLongIntSample;
	int i;
	// Load data.
	nLongIntSample = readDataSizeAndLoad(inputFilename, indexFile, &genoData, &nSample, &nCase, &nSNP, &position, &statistics, &p_value, kernelParams.isMarginal);
	if(nLongIntSample == -1) {
		printf("There is something wrong in the data file!\n");
		exit(0);
	}
	// precompute the wordbits (a global variable)
	for (i = 0; i<65536; i++)
	{
		wordbits[i] = bitCount(i);
		//printf("%d\n",wordbits[i]);
	}
	//cuda_SetWordBits(wordbits, 65536);
	// Create a context to run CUDA program on our CUDA-enabled NVIDIA GPU
	if( initCUDA(kernelParams.deviceId) == -1) {
		printf("Unable to initialize CUDA\n");
		return -1;
	}
	printf("CUDA initialized.\n\n");

	// Permute the label vector.
	// Count the numbers in the cells, get the joint distribution for each pair.
	// Calculate the statistics using the joint distribution.
	// All these are done in the kernel function.
	// The SNP vectors are stored in the const memory.
	// The permuted label vectors are stored in the local memory.
	//cuda_permute(genoData, nLongIntSample, nSample, nCase, nSNP);
	int code = 0;
	if(kernelParams.isMarginal == 1) {
		code = cuda_permute_marginal(genoData, nLongIntSample, nSample, nCase, nSNP, position, statistics, p_value, outputFilePrefix, kernelParams);
	} else {
		code = cuda_permute_interaction(genoData, nLongIntSample, nSample, nCase, nSNP, position, statistics, p_value, outputFilePrefix, kernelParams);
	}
	if(position)
		free(position);
	if(statistics)
		free(statistics);
	if(p_value)
		free(p_value);
	return code;
}

// read the number of significant SNPs
int loadSignificantSNPSize(char *IndexFile, int isMarginal) {
	FILE *fp = fopen(IndexFile, "r");
	int numSignificantSNP = 0;
	char tmp[100];
	if(fp == NULL)
	{
		fprintf(stderr, "can't open input file %s\n", IndexFile);
		return -1;
	}
	int pos;
	double sta;
	double p_val;
	if(isMarginal == 1) {
		while(!feof(fp)) {		
			fscanf(fp, "%d", &pos);
			fscanf(fp, "%lf", &sta);
			if(fscanf(fp, "%le", &p_val) != EOF)
				numSignificantSNP ++;
		}
	} else {
		while(!feof(fp)) {		
			fscanf(fp, "%d", &pos);
			fscanf(fp, "%d", &pos);
			fscanf(fp, "%lf", &sta);
			if(fscanf(fp, "%le", &p_val) != EOF)
				numSignificantSNP += 2;
		}
	}
	fclose(fp);
	return numSignificantSNP;
}

int readIndexFile(int *position, double *statistics, double *p_value, char *indexFile, int nSNP, int isMarginal) {
	FILE *fp = fopen(indexFile, "r");
	if(fp == NULL)
	{
		fprintf(stderr, "can't open input file %s\n", indexFile);
		return -1;
	}
	printf("start loading the marginal association file");
	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();
	int pos, pos2;
	double sta;
	double p_val;
	int i = 0;
	if(isMarginal == 1) {
		while(!feof(fp)) {
			if(i >= nSNP)
				break;
			fscanf(fp, "%d", &pos);
			fscanf(fp, "%lf", &sta);
			fscanf(fp, "%le", &p_val);
			position[i] = pos;
			statistics[i] = sta;
			p_value[i] = p_val;
			i++;
		}
	} else {
		while(!feof(fp)) {
			if(i >= nSNP / 2)
				break;
			fscanf(fp, "%d", &pos);
			fscanf(fp, "%d", &pos2);
			fscanf(fp, "%lf", &sta);
			fscanf(fp, "%le", &p_val);
			position[2 * i] = pos;
			position[2 * i + 1] = pos2;
			statistics[i] = sta;
			p_value[i] = p_val;
			i++;
		}
	}
}

// Get the data size, then load the genotype data.
// readDataSizeAndLoad(inputFilename, &genoData, &nSample, &nCase, &nSNP, &position, &statistics, &p_value);
int readDataSizeAndLoad(char *inputFilename, char *indexFile, uint64 **genoData, int *nSample, int *nCase, int *nSNP, int **pos, double **sta, double **p_val, int isMarginal) {
	// load data
	FILE *fp, *fp_i;
	time_t st, ed;
	char filename_i[100];
	int ndataset;
	int *DataSize;
	int LengthLongType=64;
	uint64 mask1 = 0x0000000000000001;
	int ncase, nctrl, nsample, nlongintsample;
	uint64 *genosample;
	int i, j, k, ii, flag, tmp, n, p;	
	int *position;
	double *statistics;
	double *p_value;
	// load the number of significant snps
	int numSignificant = loadSignificantSNPSize(indexFile, isMarginal);
	// load the positions of ths significant snps
	if(numSignificant > 0) {
		position = (int *)malloc(numSignificant * sizeof(int));
	    statistics = (double *)malloc(numSignificant * sizeof(double));
	    p_value = (double *)malloc(numSignificant * sizeof(double));
		readIndexFile(position, statistics, p_value, indexFile, numSignificant, isMarginal);
	}
	

	time(&st);
	fp = fopen(inputFilename,"r");
	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",inputFilename);
		return -1;
	}

	printf("start loading ...\n");
	
	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	ndataset = GetDataSize(inputFilename, &DataSize);

	n = DataSize[0];
	p = 0;
	printf("n = %d\n", n);
	for (i = 0; i<ndataset ; i++)
	{
		p += DataSize[ndataset*1 + i];
		printf("DataSize %d-th file: p[%d] = %d \n", i+1, i+1, DataSize[ndataset*1 + i]);
	}
	printf("p = %d\n",p);

	if(numSignificant != -1) {
		// There are significant snp positions specified.
		p = numSignificant;
		printf("significant p = %d\n", p);
	}

	// get ncase and nctrl
	i = 0;
	j = 0;

	ncase = 0;
	nctrl = 0;

	rewind(fp);

	// only use the first file to get ncase and nctrl
	fscanf(fp, "%s\n", &filename_i);
	printf("%s\n", filename_i);
	fp_i = fopen(filename_i, "r");

	while(!feof(fp_i)) {

		/* loop through and store the numbers into the array */
		if(j==0)
		{
			//j = 0 means readind class label y
			fscanf(fp_i, "%d", &tmp);

			if (tmp)
			{
				// tmp=1 means case
				ncase++;

			}
			else
			{
				nctrl ++;

			}
			j++;
		}
		else
		{
			fscanf(fp_i, "%d", &tmp);
			j++; //column index
			if (j==(DataSize[ndataset]+1)) // DataSize[ndataset] is the nsnp in the first dataset
			{
				j=0;
				i++; // row index
			}

		}

		if (i>=n)
		{
			break;
		}
	}
	nsample = ncase + nctrl;
	*nCase = ncase;
	*nSample = nsample;
	printf("total sample: %d (ncase = %d; nctrl = %d).\n", n, (int)ncase, (int)nctrl);


	nlongintsample = ceil( ((double) nsample)/LengthLongType);
	printf("nLongIntsample = %d.\n", nlongintsample);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	//calloc memory for bit representation
	genosample = (uint64*)calloc(3*p*nlongintsample, sizeof(uint64));

	*genoData = genosample;
	*nSNP = p;

	if(p % 2 != 0) {
		fprintf(stderr,"The number of SNPs is odd.\n");
		//return -1;
	}

	memset(*genoData, 0, 3*p*nlongintsample * sizeof(uint64));

	//load data to bit representation
	rewind(fp);

	int sigIndex = 0; // SNP index in the array position[]
	j = 0; // column index
	ii = 0; // file index
	k = 0;
	while(!feof(fp)) { 
		ii++;
		fscanf(fp, "%s\n", &filename_i);

		fp_i = fopen(filename_i, "r");
		if(fp_i == NULL)
		{
			fprintf(stderr,"can't open input file %s\n", filename_i);
			exit(1);
		}

		i = 0; //row index

		printf("Loading data in file %d: %s\n", ii, filename_i);
		
		// flush the STD_OUT for external process to read
		FLUSH_STDOUT();
		while(!feof(fp_i)) { 
			/* loop through and store the numbers into the array */

			if(j==0)
			{
				//j = 0 means read class label y
				fscanf(fp_i, "%d", &tmp);
				j++;
			}
			else
			{
				fscanf(fp_i, "%d", &tmp);
				// The SNP index is j + k - 1
				if(numSignificant != -1) {
					if(j + k - 1 == position[sigIndex]) {
						// It's a significant SNP. Add it to the bit matrix.
						genosample[nlongintsample * (3 * (sigIndex) + tmp) + (i/LengthLongType)] |= (mask1 << (i % LengthLongType));
						sigIndex++;
						//if(sigIndex >= numSignificant)
						//	break;
					}
				} else {
					// There is no significant SNP specified. So just add all SNP to the data matrix.
					genosample[nlongintsample * (3 * (j+k-1) + tmp) + (i/LengthLongType)] |= (mask1 << (i % LengthLongType));
				}

				j++; //column index
				if (j==(DataSize[ndataset + ii-1]+1))
				{
					j=0;
					sigIndex = 0;
					i++; // row index
				}

			}

			if (i>=n)
			{
				break;
			}
		}

		fclose(fp_i);
		k += DataSize[ndataset + ii-1];
	}

	fclose(fp);
	//printf("Number of numbers read: %d\n\n", n*p);
	time(&ed);
	printf("cputime for loading data: %d seconds\n", (int)ed -st);
	*pos = position;
	*sta = statistics;
	*p_val = p_value;

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();
	free(DataSize);
	//free(position);
	//free(statistics);
	//free(p_value);
	return nlongintsample;
}


